#' ---
#' title : " Project "
#' date : 4-12-2022
#' author : Karan, Sahil,Pranava,Vidhi
#'Importing packages
library(readr)
## Warning: package 'readr' was built under R version 4.2.2
library(data.table)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.2
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## Loading required package: lattice
library(rpart)
library(e1071)
library(party)
## Warning: package 'party' was built under R version 4.2.2
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.2.2
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.2.2
library(Epi)
## Warning: package 'Epi' was built under R version 4.2.2
library(ROCR)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:modeltools':
## 
##     Predict
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(VIM)
## Warning: package 'VIM' was built under R version 4.2.2
## Loading required package: colorspace
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(caTools)
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(scales) 
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
library(stringr) 
## 
## Attaching package: 'stringr'
## The following object is masked from 'package:strucchange':
## 
##     boundary
library(ggthemes) 
## Warning: package 'ggthemes' was built under R version 4.2.2
#' Import Data
data = read_csv("C:/Users/karan/OneDrive/Desktop/IIT/Data Preparation and Analysis/Group Project/bank-full.csv")
## Rows: 45211 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): job, marital, education, default, housing, loan, contact, month, p...
## dbl  (7): age, balance, day, duration, campaign, pdays, previous
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data2 = data
head(data)
## # A tibble: 6 × 17
##     age job    marital educa…¹ default balance housing loan  contact   day month
##   <dbl> <chr>  <chr>   <chr>   <chr>     <dbl> <chr>   <chr> <chr>   <dbl> <chr>
## 1    58 manag… married tertia… no         2143 yes     no    unknown     5 may  
## 2    44 techn… single  second… no           29 yes     no    unknown     5 may  
## 3    33 entre… married second… no            2 yes     yes   unknown     5 may  
## 4    47 blue-… married unknown no         1506 yes     no    unknown     5 may  
## 5    33 unkno… single  unknown no            1 no      no    unknown     5 may  
## 6    35 manag… married tertia… no          231 yes     no    unknown     5 may  
## # … with 6 more variables: duration <dbl>, campaign <dbl>, pdays <dbl>,
## #   previous <dbl>, poutcome <chr>, y <chr>, and abbreviated variable name
## #   ¹​education
str(data)
## spc_tbl_ [45,211 × 17] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ age      : num [1:45211] 58 44 33 47 33 35 28 42 58 43 ...
##  $ job      : chr [1:45211] "management" "technician" "entrepreneur" "blue-collar" ...
##  $ marital  : chr [1:45211] "married" "single" "married" "married" ...
##  $ education: chr [1:45211] "tertiary" "secondary" "secondary" "unknown" ...
##  $ default  : chr [1:45211] "no" "no" "no" "no" ...
##  $ balance  : num [1:45211] 2143 29 2 1506 1 ...
##  $ housing  : chr [1:45211] "yes" "yes" "yes" "yes" ...
##  $ loan     : chr [1:45211] "no" "no" "yes" "no" ...
##  $ contact  : chr [1:45211] "unknown" "unknown" "unknown" "unknown" ...
##  $ day      : num [1:45211] 5 5 5 5 5 5 5 5 5 5 ...
##  $ month    : chr [1:45211] "may" "may" "may" "may" ...
##  $ duration : num [1:45211] 261 151 76 92 198 139 217 380 50 55 ...
##  $ campaign : num [1:45211] 1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays    : num [1:45211] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ previous : num [1:45211] 0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome : chr [1:45211] "unknown" "unknown" "unknown" "unknown" ...
##  $ y        : chr [1:45211] "no" "no" "no" "no" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   age = col_double(),
##   ..   job = col_character(),
##   ..   marital = col_character(),
##   ..   education = col_character(),
##   ..   default = col_character(),
##   ..   balance = col_double(),
##   ..   housing = col_character(),
##   ..   loan = col_character(),
##   ..   contact = col_character(),
##   ..   day = col_double(),
##   ..   month = col_character(),
##   ..   duration = col_double(),
##   ..   campaign = col_double(),
##   ..   pdays = col_double(),
##   ..   previous = col_double(),
##   ..   poutcome = col_character(),
##   ..   y = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
summary(data)
##       age            job              marital           education        
##  Min.   :18.00   Length:45211       Length:45211       Length:45211      
##  1st Qu.:33.00   Class :character   Class :character   Class :character  
##  Median :39.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :40.94                                                           
##  3rd Qu.:48.00                                                           
##  Max.   :95.00                                                           
##    default             balance         housing              loan          
##  Length:45211       Min.   : -8019   Length:45211       Length:45211      
##  Class :character   1st Qu.:    72   Class :character   Class :character  
##  Mode  :character   Median :   448   Mode  :character   Mode  :character  
##                     Mean   :  1362                                        
##                     3rd Qu.:  1428                                        
##                     Max.   :102127                                        
##    contact               day           month              duration     
##  Length:45211       Min.   : 1.00   Length:45211       Min.   :   0.0  
##  Class :character   1st Qu.: 8.00   Class :character   1st Qu.: 103.0  
##  Mode  :character   Median :16.00   Mode  :character   Median : 180.0  
##                     Mean   :15.81                      Mean   : 258.2  
##                     3rd Qu.:21.00                      3rd Qu.: 319.0  
##                     Max.   :31.00                      Max.   :4918.0  
##     campaign          pdays          previous          poutcome        
##  Min.   : 1.000   Min.   : -1.0   Min.   :  0.0000   Length:45211      
##  1st Qu.: 1.000   1st Qu.: -1.0   1st Qu.:  0.0000   Class :character  
##  Median : 2.000   Median : -1.0   Median :  0.0000   Mode  :character  
##  Mean   : 2.764   Mean   : 40.2   Mean   :  0.5803                     
##  3rd Qu.: 3.000   3rd Qu.: -1.0   3rd Qu.:  0.0000                     
##  Max.   :63.000   Max.   :871.0   Max.   :275.0000                     
##       y            
##  Length:45211      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
#Checking dimensions of data 
dim(data)
## [1] 45211    17
#' Exploratory Data Analysis
# Yes and Output Analysis

data %>% 
  count(y) %>% 
  mutate(perc = n / nrow(data)) -> predictorData2

ggplot(predictorData2, aes(x = y, y = perc)) + geom_bar(stat = "identity")

# Job type analysis

data %>% 
  count(job) %>% 
  mutate(perc = n / nrow(data)) -> bank3

ggplot(bank3, aes(x = job, y = perc)) + geom_bar(stat = "identity")

# Job type analysis with output field

ggplot(data, 
       aes(x = job,
           fill = y)) + 
  geom_bar(position = "dodge")

# Marital type analysis

data %>% 
  count(marital) %>% 
  mutate(perc = n / nrow(data)) -> bank3

ggplot(bank3, aes(x = marital, y = perc)) + geom_bar(stat = "identity")

# Marital type analysis with output field

ggplot(data, 
       aes(x = marital,
           fill = y)) + 
  geom_bar(position = "dodge")

# Defaulter by bank field analysis 

data %>% 
  count(default) %>% 
  mutate(perc = n / nrow(data)) -> bank3

ggplot(bank3, aes(x = default, y = perc)) + geom_bar(stat = "identity")

# Defaulter by bank analysis with output field

ggplot(data, 
       aes(x = default,
           fill = y)) + 
  geom_bar(position = "dodge")

Age fieled analysis to identify age group which is contacted more by banks.

data2$AgeGroup <- cut(data2$age, 
                         breaks = c(-Inf,
                                    10,20,30,40,50,60,70,80,90,100
                                    , Inf), 
                         
                         labels = c("0-9 "
                                    ,"10-19 ","20-29","30-39","40-49"
                                    ,"50-59","60-69","70-79","80-89"
                                    ,"90-100","100 +"),
                         right = FALSE)


data2 %>% 
  count(AgeGroup) %>% 
  mutate(perc = n / nrow(data2)) -> bank3

ggplot(bank3, aes(x = AgeGroup, y = perc)) + geom_bar(stat = "identity")

Age fieled analysis to identify age group which is contacted more by banks with output fields

ggplot(data2, 
       aes(x = AgeGroup,
           fill = y)) + 
   geom_bar(position = "dodge")

# Housing field analysis 

data %>% 
  count(housing) %>% 
  mutate(perc = n / nrow(data)) -> bank3

ggplot(bank3, aes(x = housing, y = perc)) + geom_bar(stat = "identity")

# Housing field analysis  with output field

ggplot(data, 
       aes(x = housing,
           fill = y)) + 
  geom_bar(position = "dodge")

# Previous marketing outcome analysis

data %>% 
  count(poutcome) %>% 
  mutate(perc = n / nrow(data)) -> bank3

ggplot(bank3, aes(x = poutcome, y = perc)) + geom_bar(stat = "identity")

# Previous marketing outcome  analysis with output field

ggplot(data, 
       aes(x = poutcome,
           fill = y)) + 
  geom_bar(position = "dodge")

Month fields analysis shows in which month bank contact more with customers

Month Note- day of the week is required calculation and its not useful data as usually in any banking sector performance is calculated monthly and quarterly or yearly.

data %>% 
  count(month) %>% 
  mutate(perc = n / nrow(data),Month = factor(month, levels = c("jan", "feb", "mar", "apr", "may", "jun", 
                                                                "jul", "aug", "sep", "oct", "nov", "dec"))) -> bank3

ggplot(bank3, aes(x = Month, y = perc)) + geom_bar(stat = "identity")

# Month fields analysis shows in which month bank contact more with customers with output fields

bankMutate <- data %>%
  mutate(Month = factor(month, levels = c("jan", "feb", "mar", "apr", "may", "jun", 
                                          "jul", "aug", "sep", "oct", "nov", "dec")))

ggplot(bankMutate, 
       aes(x = Month,
           fill = y)) + 
  geom_bar(position = "dodge")

# Call duration analysis

ggplot_duration<- ggplotly(ggplot(data, aes(x=as.factor(y), y=duration)) +
                             geom_boxplot(fill='#A4A4A4', color="black")+
                             theme_classic())
ggplot_duration
#'' Data Manipulation and Descriptive Analysis
# Unique values in different columns
data%>% distinct(job)
## # A tibble: 12 × 1
##    job          
##    <chr>        
##  1 management   
##  2 technician   
##  3 entrepreneur 
##  4 blue-collar  
##  5 unknown      
##  6 retired      
##  7 admin.       
##  8 services     
##  9 self-employed
## 10 unemployed   
## 11 housemaid    
## 12 student
data %>% distinct(marital)
## # A tibble: 3 × 1
##   marital 
##   <chr>   
## 1 married 
## 2 single  
## 3 divorced
data %>% distinct(education)
## # A tibble: 4 × 1
##   education
##   <chr>    
## 1 tertiary 
## 2 secondary
## 3 unknown  
## 4 primary
data %>% distinct(default)
## # A tibble: 2 × 1
##   default
##   <chr>  
## 1 no     
## 2 yes
data %>% distinct(loan)
## # A tibble: 2 × 1
##   loan 
##   <chr>
## 1 no   
## 2 yes
data %>% distinct(contact)
## # A tibble: 3 × 1
##   contact  
##   <chr>    
## 1 unknown  
## 2 cellular 
## 3 telephone
data %>% distinct(poutcome)
## # A tibble: 4 × 1
##   poutcome
##   <chr>   
## 1 unknown 
## 2 failure 
## 3 other   
## 4 success
data %>% distinct(y)
## # A tibble: 2 × 1
##   y    
##   <chr>
## 1 no   
## 2 yes
# Job Column - Spelling error
data$job<-gsub(".", "", data$job, fixed = TRUE)

# Checking missing values in data set
colSums(is.na.data.frame(data))
##       age       job   marital education   default   balance   housing      loan 
##         0         0         0         0         0         0         0         0 
##   contact       day     month  duration  campaign     pdays  previous  poutcome 
##         0         0         0         0         0         0         0         0 
##         y 
##         0
# No missing values in data

# Substitute unknowns in the data set with NA
data[data == "unknown"] <- NA
#' Data Preparation
# Converting categorical target variable in yes/no form:
data$y<-ifelse(data$y=="yes",1,0)
#' Splitting Data
set.seed(1)
# Use 70% of data set as training set and 30% as test set
sample <- sample.split(data$y, SplitRatio = 0.7)
train  <- subset(data, sample == TRUE)
test   <- subset(data, sample == FALSE)
train<-train %>% mutate(y=as.factor(y))
test<-test %>% mutate(y=as.factor(y))

# Splitting train into X and Y train
x_train<-as.data.frame(train) %>% select(-y)
y_train<-train$y
# Splitting test into X and Y train
x_test<-as.data.frame(test) %>% select(-y)
y_test<-test$y
#' Imbalanced Data set
# We need to balance our data
table(y_train)
## y_train
##     0     1 
## 27945  3702
set.seed(123)
train_downsample <- downSample(x = x_train,y = y_train,yname = "y")
train_downsample<-train_downsample %>% mutate(y=as.factor(y))
#' Imbalanced Data set
# We need to balance our data
table(y_train)
## y_train
##     0     1 
## 27945  3702
set.seed(123)
train_downsample <- downSample(x = x_train,y = y_train,yname = "y")
train_downsample<-train_downsample %>% mutate(y=as.factor(y))
#' Logistic Regression
model_full<-glm(y~.,train_downsample, family = "binomial")
summary(model_full)
## 
## Call:
## glm(formula = y ~ ., family = "binomial", data = train_downsample)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.6981  -0.4844   0.2424   0.5487   2.1639  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -2.456e+00  6.724e-01  -3.653 0.000260 ***
## age                 4.024e-03  8.902e-03   0.452 0.651238    
## jobblue-collar     -2.201e-01  2.652e-01  -0.830 0.406624    
## jobentrepreneur    -5.843e-01  5.021e-01  -1.164 0.244518    
## jobhousemaid       -6.426e-01  5.536e-01  -1.161 0.245738    
## jobmanagement      -1.596e-01  2.761e-01  -0.578 0.563272    
## jobretired         -1.684e-01  3.785e-01  -0.445 0.656442    
## jobself-employed   -1.630e-01  4.157e-01  -0.392 0.694892    
## jobservices         4.076e-02  3.258e-01   0.125 0.900445    
## jobstudent          3.603e-01  4.325e-01   0.833 0.404749    
## jobtechnician       4.431e-02  2.477e-01   0.179 0.858010    
## jobunemployed       8.166e-02  4.446e-01   0.184 0.854267    
## maritalmarried      2.262e-01  2.228e-01   1.015 0.309973    
## maritalsingle       2.094e-01  2.622e-01   0.799 0.424577    
## educationsecondary  3.454e-01  2.534e-01   1.363 0.172823    
## educationtertiary   9.190e-01  3.014e-01   3.049 0.002299 ** 
## defaultyes         -9.174e-01  1.052e+00  -0.872 0.383352    
## balance             8.469e-06  2.342e-05   0.362 0.717678    
## housingyes         -1.092e+00  1.647e-01  -6.631 3.33e-11 ***
## loanyes            -2.390e-01  2.389e-01  -1.000 0.317081    
## contacttelephone   -3.651e-01  2.826e-01  -1.292 0.196417    
## day                 1.480e-02  9.826e-03   1.506 0.131949    
## monthaug            1.307e+00  3.224e-01   4.053 5.05e-05 ***
## monthdec            3.021e-01  5.177e-01   0.584 0.559528    
## monthfeb            3.399e-01  3.038e-01   1.119 0.263174    
## monthjan           -4.140e-01  3.729e-01  -1.110 0.266952    
## monthjul            1.707e+00  4.675e-01   3.652 0.000260 ***
## monthjun            1.783e+00  4.197e-01   4.248 2.16e-05 ***
## monthmar            1.998e+00  5.096e-01   3.920 8.85e-05 ***
## monthmay           -1.317e-01  2.539e-01  -0.519 0.603968    
## monthnov            2.330e-01  2.882e-01   0.809 0.418800    
## monthoct            1.023e+00  3.435e-01   2.977 0.002911 ** 
## monthsep            1.242e+00  3.617e-01   3.434 0.000594 ***
## duration            6.145e-03  4.399e-04  13.970  < 2e-16 ***
## campaign           -1.788e-01  5.664e-02  -3.156 0.001600 ** 
## pdays               7.503e-04  6.122e-04   1.226 0.220374    
## previous            2.260e-02  2.056e-02   1.099 0.271773    
## poutcomeother       2.558e-01  1.822e-01   1.404 0.160314    
## poutcomesuccess     2.051e+00  1.849e-01  11.095  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2278.2  on 1821  degrees of freedom
## Residual deviance: 1331.7  on 1783  degrees of freedom
##   (5582 observations deleted due to missingness)
## AIC: 1409.7
## 
## Number of Fisher Scoring iterations: 6
# Important coefficients in Model
varImp(model_full, scale = FALSE)
##                       Overall
## age                 0.4520434
## jobblue-collar      0.8298500
## jobentrepreneur     1.1637675
## jobhousemaid        1.1607643
## jobmanagement       0.5779882
## jobretired          0.4448314
## jobself-employed    0.3922245
## jobservices         0.1250987
## jobstudent          0.8331692
## jobtechnician       0.1789080
## jobunemployed       0.1836766
## maritalmarried      1.0152796
## maritalsingle       0.7985060
## educationsecondary  1.3631889
## educationtertiary   3.0486142
## defaultyes          0.8717375
## balance             0.3615638
## housingyes          6.6313529
## loanyes             1.0004740
## contacttelephone    1.2918284
## day                 1.5064608
## monthaug            4.0531281
## monthdec            0.5835426
## monthfeb            1.1189196
## monthjan            1.1101095
## monthjul            3.6518209
## monthjun            4.2477855
## monthmar            3.9200684
## monthmay            0.5187031
## monthnov            0.8085047
## monthoct            2.9769592
## monthsep            3.4344076
## duration           13.9696739
## campaign            3.1559215
## pdays               1.2255330
## previous            1.0989886
## poutcomeother       1.4040177
## poutcomesuccess    11.0951074
# Predictions
predicted_log<-predict(model_full,x_test,type="response")
pred_log_test <- ifelse(predicted_log > 0.5, 1, 0) %>% as.factor()
cm_log<-confusionMatrix(pred_log_test,y_test,positive = "1")
cm_log
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1339   53
##          1  487  490
##                                           
##                Accuracy : 0.7721          
##                  95% CI : (0.7546, 0.7888)
##     No Information Rate : 0.7708          
##     P-Value [Acc > NIR] : 0.4531          
##                                           
##                   Kappa : 0.4963          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9024          
##             Specificity : 0.7333          
##          Pos Pred Value : 0.5015          
##          Neg Pred Value : 0.9619          
##              Prevalence : 0.2292          
##          Detection Rate : 0.2068          
##    Detection Prevalence : 0.4124          
##       Balanced Accuracy : 0.8178          
##                                           
##        'Positive' Class : 1               
## 
#' Naive Bayes Classifier
model_naive <- naiveBayes(formula = y ~ ., data = train_downsample,laplace=1)
summary(model_naive)
##           Length Class  Mode     
## apriori    2     table  numeric  
## tables    16     -none- list     
## levels     2     -none- character
## isnumeric 16     -none- logical  
## call       4     -none- call
# Predictions
naive_pred = predict(model_naive,x_test,type="class" )
naive_log = confusionMatrix(table(naive_pred,y_test))
naive_log
## Confusion Matrix and Statistics
## 
##           y_test
## naive_pred     0     1
##          0 10084   415
##          1  1893  1172
##                                           
##                Accuracy : 0.8298          
##                  95% CI : (0.8234, 0.8361)
##     No Information Rate : 0.883           
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.4134          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8419          
##             Specificity : 0.7385          
##          Pos Pred Value : 0.9605          
##          Neg Pred Value : 0.3824          
##              Prevalence : 0.8830          
##          Detection Rate : 0.7434          
##    Detection Prevalence : 0.7740          
##       Balanced Accuracy : 0.7902          
##                                           
##        'Positive' Class : 0               
##